# Start from an empty workspace and load the project helpers. This script never
# calls library() itself, so functions.R is expected to attach the packages
# used below (TODO confirm).
rm(list = ls())
source("functions.R")
quanteda_options(threads = 8)

# Topic seed dictionary, read from YAML.
dict <- dictionary(file = "topic_low-entropy.yml")

# Tokens object (presumably sentence-level, judging by the file name),
# restricted to documents whose `year` docvar is 1991 or later.
toks <- readRDS("data/data_tokens_sent.RDS")
toks <- tokens_subset(toks, year >= 1991)

# Compound dictionary matches into single tokens, joined by a space.
toks <- tokens_compound(toks, dict, concatenator = " ", join = FALSE)

# Train
# Seed labels: dictionary matches (first dictionary level only) per document.
mt_key <- toks %>%
    tokens_lookup(dict, levels = 1) %>%
    dfm()

# Features: drop empty tokens and English stopwords, then keep only terms
# that occur at least 10 times.
# NOTE(review): newer quanteda releases deprecate dfm(remove = ); check the
# installed version before moving this to dfm_remove().
mt <- dfm_trim(dfm(toks, remove = c("", stopwords("en"))), min_termfreq = 10)

# Fit the newsmap model on the features, using the seed labels.
map <- textmodel_newsmap(mt, mt_key)

# Test
# Keep only the documents that have a non-missing `topic_human` docvar.
mt_coded <- mt %>% dfm_subset(!is.na(topic_human))

# Put the model's prediction next to the document variables.
dat <- docvars(mt_coded)
dat[["topic"]] <- predict(map, newdata = mt_coded)

# test_accuracy() is not defined in this file (presumably it comes from
# functions.R). The summary is left as a top-level expression so that it is
# auto-printed.
acc <- test_accuracy(dat)
summary(acc)

# Raw
# One list entry per `docname` level still present in `dat`; the "raw" element
# holds that document's rows of `dat`.
pred <- list()
doc_names <- levels(droplevels(dat$docname))
for (i in seq_along(doc_names)) {
    cat(doc_names[i], "\n")
    # which() drops NA comparisons, matching what subset() does.
    in_doc <- which(dat$docname == doc_names[i])
    pred[[doc_names[i]]][["raw"]] <- dat[in_doc, , drop = FALSE]
}

# Smooth
# Convolution of two Daniell kernels of dimension 3. It is the same for every
# document, so it is built once before the loop.
smoother <- kernel("daniell", c(3, 3))

for (doc_label in levels(droplevels(dat$docname))) {
    cat(doc_label, "\n")
    doc_dfm <- dfm_subset(mt_coded, docname == doc_label)

    # Scores for all classes (type = "all"); missing scores are set to zero
    # before smoothing.
    scores <- as.matrix(predict(map, newdata = doc_dfm, type = "all"))
    scores <- replace(scores, is.na(scores), 0)

    # NOTE(review): kernapply() stops with an error when the input is not
    # longer than twice the kernel dimension (12 rows here, if the combined
    # dimension is 6 as expected) -- confirm every coded document is longer.
    smoothed <- kernapply(scores, smoother)

    # The smoothed matrix has fewer rows than `scores`, so the human codes are
    # looked up by row name; this relies on kernapply() keeping the row names.
    human_codes <- setNames(docvars(doc_dfm, "topic_human"), docnames(doc_dfm))
    attr(smoothed, "topic_human") <- human_codes[rownames(smoothed)]

    pred[[doc_label]][["smooth"]] <- smoothed
}

# Save the per-document list of predictions.
saveRDS(pred, "data_prediction_country.RDS")
